Import helpers and packages


In [33]:
import pandas as pd
from autoc import DataExploration, PreProcessor, NaImputer
from autoc.utils.getdata import get_dataset
import numpy as np

# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, train_test_split  # moved to sklearn.model_selection in newer scikit-learn versions
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, accuracy_score, auc, classification_report

# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

Playing with titanic data

Two approaches:

  • Simulate missing data and check whether we can impute them correctly with a machine learning algorithm
  • Impute the real missing values and check whether this improves prediction performance

In [34]:
titanic = get_dataset("titanic")

In [35]:
titanic.who.dtype.kind == 'O'


Out[35]:
True

In [36]:
titanic.head()


Out[36]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35 0 0 8.0500 S Third man True NaN Southampton no True

In [37]:
exploration_titanic = DataExploration(titanic)

In [38]:
exploration_titanic.print_infos() # there are duplicate rows here because there is no id column, interesting!


{'duplicated_rows': {'action': 'delete',
                     'comment': 'You should delete this rows with df.drop_duplicates()',
                     'level': 'ERROR',
                     'value': Int64Index([ 47,  76,  77,  87,  95, 101, 121, 133, 173, 196,
            ...
            838, 844, 846, 859, 863, 870, 877, 878, 884, 886],
           dtype='int64', length=107)}}
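
The report literally suggests df.drop_duplicates(); a minimal sketch of following that advice (not applied here, the rest of the notebook keeps the full dataset):

In [ ]:
titanic_dedup = titanic.drop_duplicates()  # drops the 107 duplicated rows flagged above
print("{} -> {}".format(titanic.shape, titanic_dedup.shape))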

In [39]:
exploration_titanic.nacolcount()


Out[39]:
Nanumber Napercentage
survived 0 0.000000
pclass 0 0.000000
sex 0 0.000000
age 177 0.198653
sibsp 0 0.000000
parch 0 0.000000
fare 0 0.000000
embarked 2 0.002245
class 0 0.000000
who 0 0.000000
adult_male 0 0.000000
deck 688 0.772166
embark_town 2 0.002245
alive 0 0.000000
alone 0 0.000000
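
deck stands out with ~77% of its values missing; a quick sketch (not in the original notebook) of flagging mostly-missing columns from this table:

In [ ]:
na_counts = exploration_titanic.nacolcount()
na_counts[na_counts.Napercentage > 0.5].index.tolist()  # here only ['deck']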

In [40]:
exploration_titanic.structure()


Out[40]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key dtype_infer string_length
survived int64 numeric 0 0.000000 2 False False False integer NaN
pclass int64 numeric 0 0.000000 3 False False False integer NaN
sex object factor 0 0.000000 2 False False False string 6
age float64 numeric 177 0.198653 88 False False False floating NaN
sibsp int64 numeric 0 0.000000 7 False False False integer NaN
parch int64 numeric 0 0.000000 7 False False False integer NaN
fare float64 numeric 0 0.000000 248 False False False floating NaN
embarked object factor 2 0.002245 3 False False False mixed 1
class object factor 0 0.000000 3 False False False string 6
who object factor 0 0.000000 3 False False False string 5
adult_male bool factor 0 0.000000 2 False False False boolean NaN
deck object factor 688 0.772166 7 False False False mixed 1
embark_town object factor 2 0.002245 3 False False False mixed 11
alive object factor 0 0.000000 2 False False False string 3
alone bool factor 0 0.000000 2 False False False boolean NaN

In [41]:
titanic.corr()


Out[41]:
survived pclass age sibsp parch fare adult_male alone
survived 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 -0.557080 -0.203367
pclass -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 0.094035 0.135207
age -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 0.280328 0.198270
sibsp -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 -0.253586 -0.584471
parch 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 -0.349943 -0.583398
fare 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 -0.182024 -0.271832
adult_male -0.557080 0.094035 0.280328 -0.253586 -0.349943 -0.182024 1.000000 0.404744
alone -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 0.404744 1.000000
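
A quick way to eyeball this matrix graphically (a hedged sketch, not part of the original notebook):

In [ ]:
corr = titanic.corr()
plt.matshow(corr)  # heatmap of the correlation matrix
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar()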

In [42]:
titanic.loc[titanic.age.isnull(),:].head(5)


Out[42]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
5 0 3 male NaN 0 0 8.4583 Q Third man True NaN Queenstown no True
17 1 2 male NaN 0 0 13.0000 S Second man True NaN Southampton yes True
19 1 3 female NaN 0 0 7.2250 C Third woman False NaN Cherbourg yes True
26 0 3 male NaN 0 0 7.2250 C Third man True NaN Cherbourg no True
28 1 3 female NaN 0 0 7.8792 Q Third woman False NaN Queenstown yes True

Preprocessing data


In [43]:
preprocessor = PreProcessor(titanic)

In [44]:
preprocessor.infer_subtypes()


Out[44]:
{'adult_male': {'dtype': dtype('bool'), 'subtype': 'binary'},
 'age': {'dtype': dtype('float64'), 'subtype': None},
 'alive': {'dtype': dtype('O'), 'subtype': 'binary'},
 'alone': {'dtype': dtype('bool'), 'subtype': 'binary'},
 'class': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'deck': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'embark_town': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'embarked': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'fare': {'dtype': dtype('float64'), 'subtype': None},
 'parch': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'pclass': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'sex': {'dtype': dtype('O'), 'subtype': 'binary'},
 'sibsp': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'survived': {'dtype': dtype('int64'), 'subtype': 'binary'},
 'who': {'dtype': dtype('O'), 'subtype': 'text_categorical'}}
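
This mapping can be reused to pick the columns to one-hot encode later on; a minimal sketch (column order may differ from run to run):

In [ ]:
subtypes = preprocessor.infer_subtypes()
[col for col, info in subtypes.items() if info['subtype'] == 'text_categorical']
# according to the output above: class, deck, embark_town, embarked, who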

In [45]:
titanic = titanic.drop('alive', axis = 1)

Transform everything to numeric variables for the scikit-learn model

The dataset is not clean enough to be converted directly into a numpy array and used for machine learning with scikit-learn:

  • The target variable (alive) is not an integer dummy variable (0 or 1); a small sketch of that conversion follows this list
  • The values are heterogeneous (string labels for categories, integers and floating point numbers), whereas numpy arrays for scikit-learn only support integers and floating point numbers
  • Some observations are missing (np.nan)
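
For illustration only (the notebook uses survived as the target instead), a minimal sketch of converting a yes/no label such as alive into the 0/1 dummy that scikit-learn expects; a fresh copy is reloaded because alive was dropped in the cell above:

In [ ]:
titanic_raw = get_dataset("titanic")                       # fresh copy, still contains 'alive'
alive_dummy = (titanic_raw['alive'] == 'yes').astype(int)  # 'yes'/'no' -> 1/0
alive_dummy.head()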

In [46]:
features_full = pd.concat([titanic.loc[:, ['fare', 'age', 'pclass', 'sibsp', 'parch']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['alone'], prefix='alone'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)

In [47]:
features = pd.concat([titanic[['fare', 'age', 'pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)

In [48]:
target = titanic.survived

Simple Model to understand variable importances

Prepare data

In [49]:
# Impute missing values 
imp = NaImputer(features_full)
features_full = imp.basic_naimputation(['age']) # still a pandas DataFrame, but with the age NaNs imputed
target = titanic.survived

In [50]:
# Creating test train 
features_train, features_test, target_train, target_test = train_test_split(
    features_full.values, target.values, test_size=0.25, random_state=0)

Logistic Regression

In [51]:
logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)
target_pred = logreg.predict(features_test)


feature_names = features_full.columns
print("Accuracy : {}".format(accuracy_score(target_test, target_pred)))
weights = logreg.coef_.flatten()
dict_weights = {k:v for k,v in zip(feature_names, weights)}


Accuracy : 0.811659192825

In [52]:
def plot_simple_imp(imp, feature_names, sort=True, absolute=False):
    """ Horizontal bar plot of weights / importances indexed by feature names """
    serie = pd.Series(index=feature_names, data=imp)
    if absolute:
        serie = np.abs(serie)
    if sort:
        serie.sort_values(inplace=True, ascending=False)
    serie.plot(kind='barh')

In [53]:
plot_simple_imp(weights, feature_names)



In [54]:
# Looking at weights 
feature_names = features_full.columns

def plot_abs_weights(coeff_arr, feature_names, title=None, legend_size=12, figsize=(15, 7)):
    """ Sorted horizontal bar plot of absolute coefficients with their feature names """
    coeff_arr = np.abs(coeff_arr)  # take absolute value
    order = np.argsort(coeff_arr)  # sort coefficients and keep the labels aligned with them
    plt.figure(figsize=figsize)
    plt.barh(range(len(feature_names)), coeff_arr[order])
    plt.yticks(range(len(feature_names)), np.asarray(feature_names)[order], size=legend_size)
    if title:
        plt.title(title)

In [55]:
plot_abs_weights(logreg.coef_.ravel(), feature_names, title="Absolute Coefficient Logistic Regression")


Random Forest

In [56]:
rf_full = RandomForestClassifier(n_estimators=500)
rf_full.fit(features_train, target_train)


Out[56]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
rf_full.score(features_test, target_test)


Out[57]:
0.81165919282511212

In [58]:
plot_simple_imp(rf_full.feature_importances_, feature_names)


Studying simple imputation techniques and their effect on accuracy

Here we try to predict who survives, keeping the natural missing values in the age variable, and compare four ways of handling them (a compact preview of the whole comparison follows the helper functions in the next cell):

  • Deletion of rows containing missing values
  • Deletion of columns containing missing values
  • Imputation of missing values with the median
  • Imputation of missing values with -1

In [130]:
def rf_cv(features, target, random_state=1, n_estimators=200, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print scores of a random forest cross validation """
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    scores = cross_val_score(rf, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Random Forest CV scores:min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
                scores.min(), scores.mean(), scores.max()))
    return scores

def logreg_cv(features, target, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print scores of a logistic regression cross validation """
    logreg = LogisticRegression(C=1)
    scores = cross_val_score(logreg, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Logistic Regression CV scores: min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
                scores.min(), scores.mean(), scores.max()))
    return scores
    

def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1])
    
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specificity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
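
Before walking through each strategy cell by cell, here is a compact hedged preview of the same comparison in a single loop (assuming features, target and rf_cv as defined above; only age contains NaN in features):

In [ ]:
strategies = {
    'row deletion': (features.dropna(subset=['age']), target[features.age.notnull()]),
    'column deletion': (features.drop('age', axis=1), target),
    'median imputation': (features.fillna({'age': features.age.median()}), target),
    'constant -1 imputation': (features.fillna(-1), target),
}
for name, (X, y) in strategies.items():
    print(name)
    rf_cv(X.values, y.values, scoring='accuracy')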

Row Deletion strategy


In [131]:
# selecting index and transforming into numpy array 
index_missing_age = features.age.isnull()
features_rm_a, target_rm_a = features.loc[~index_missing_age, :].values, target[~index_missing_age].values

In [132]:
features_rm_a.shape


Out[132]:
(714, 11)

In [133]:
rf_cv(features_rm_a, target_rm_a, scoring='accuracy')


Random Forest CV scores:min: 0.726, mean: 0.784, max: 0.816
Out[133]:
array([ 0.72625698,  0.81564246,  0.80898876,  0.78651685])

In [134]:
features_train_rm, features_test_rm, target_train_rm, target_test_rm = train_test_split(
    features_rm_a, target_rm_a, test_size=0.25, random_state=0)

In [135]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_rm, target_train_rm)
target_predicted_proba = rf.predict_proba(features_test_rm)
plot_roc_curve(target_test_rm, target_predicted_proba)



In [136]:
rf_cv(features_rm_a, target_rm_a,random_state=0, scoring='roc_auc')


Random Forest CV scores:min: 0.812, mean: 0.849, max: 0.865
Out[136]:
array([ 0.81209615,  0.85655208,  0.86130765,  0.86458333])

Col Deletion strategy


In [137]:
# selecting index and transforming into numpy array 
features_cm_a, target_cm_a = features.drop('age', axis =1).values, target.values

In [138]:
features_cm_a.shape


Out[138]:
(891, 10)

In [139]:
rf_cv(features_cm_a, target_cm_a, scoring="accuracy")


Random Forest CV scores:min: 0.759, mean: 0.809, max: 0.833
Out[139]:
array([ 0.75892857,  0.8206278 ,  0.83333333,  0.82432432])

In [140]:
features_train_cm, features_test_cm, target_train_cm, target_test_cm = train_test_split(
    features_cm_a, target_cm_a, test_size=0.25, random_state=0)

In [141]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_cm, target_train_cm)
target_predicted_proba = rf.predict_proba(features_test_cm)
plot_roc_curve(target_test_cm, target_predicted_proba)



In [142]:
rf_cv(features_cm_a, target_cm_a, scoring="roc_auc")


Random Forest CV scores:min: 0.805, mean: 0.850, max: 0.880
Out[142]:
array([ 0.80535895,  0.88032592,  0.85169601,  0.86225848])

Median Imputation strategy


In [143]:
# selecting index and transforming into numpy array 

features_imp = features.copy()
features.shape


Out[143]:
(891, 11)

In [144]:
# features_imp.loc[:,'is_na_age'] =features_imp.age.isnull().astype(int)
# imp = NaImputer(features) # creating our imputer instance
# features_imp = imp.basic_naimputation(columns_to_process=['age'])

In [145]:
features_imp = features.fillna(-1)  # constant -1 imputation is used here; the NaImputer-based imputation above is left commented out

In [146]:
features_imp_a, target_imp_a = features_imp.values, target.values

In [147]:
rf_cv(features_imp_a, target_imp_a, scoring='accuracy')


Random Forest CV scores:min: 0.763, mean: 0.804, max: 0.821
Out[147]:
array([ 0.76339286,  0.8206278 ,  0.81531532,  0.81531532])

In [148]:
features_train_imp, features_test_imp, target_train_imp, target_test_imp = train_test_split(
    features_imp_a, target_imp_a, test_size=0.25, random_state=0)

In [149]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_imp, target_train_imp)
target_predicted_proba = rf.predict_proba(features_test_imp)
plot_roc_curve(target_test_imp, target_predicted_proba)



In [150]:
rf_cv(features_imp_a, target_imp_a, scoring='roc_auc')


Random Forest CV scores:min: 0.819, mean: 0.852, max: 0.873
Out[150]:
array([ 0.81905123,  0.87260227,  0.85234006,  0.86225848])

In [151]:
rf.feature_importances_


Out[151]:
array([ 0.28061501,  0.2436398 ,  0.0967306 ,  0.06874228,  0.09025391,
        0.01820227,  0.11959495,  0.04632542,  0.01161832,  0.0097387 ,
        0.01453875])
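
The raw importance array is hard to read; a quick sketch that attaches the column names of features_imp:

In [ ]:
pd.Series(rf.feature_importances_, index=features_imp.columns).sort_values(ascending=False)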

Simulating missing values in a column and measuring imputation performance

The purpose of this section is to simulate missing values in an important variable (pclass in the cells below), observe the resulting decrease in performance, and assess the ability to correctly impute the missing values.

  • We start from the features_full dataset

In [152]:
# constructing features 
features_imp = pd.concat([titanic[['pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who')],axis=1)

In [153]:
features_imp.pclass.value_counts()


Out[153]:
3    491
1    216
2    184
Name: pclass, dtype: int64
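
As a point of reference for the imputation-by-prediction experiment at the end of the notebook: the majority class (pclass = 3) alone covers 491 / 891 ≈ 55% of the rows. A hedged sketch of that naive baseline:

In [ ]:
majority_share = features_imp.pclass.value_counts(normalize=True).max()
print("Always predicting pclass = 3 would be right {:.1%} of the time".format(majority_share))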

Comparing simulated model and raw model


In [156]:
#scores_imp = logreg_cv(features_imp.drop('pclass',1), target)

In [174]:
def insert_na(features_full=features_imp, target=target, index=False,
              col_to_simulate='pclass', pct_na_toinsert=0.2, verbose=False):
    """ Return the dataset with a given percentage of NaN injected into one column
    (or only the injected index positions if index=True) """
    nb_na_toinsert = int(pct_na_toinsert * len(features_full))
    index_na_toinsert = np.random.choice(range(len(features_full)), nb_na_toinsert, replace=False)
    if verbose:
        print("We are inserting {} missing values".format(len(index_na_toinsert)))
    features_full_imp = features_full.copy()
    if index:
        return index_na_toinsert
    else:
        features_full_imp.loc[index_na_toinsert, col_to_simulate] = np.nan
        return features_full_imp

def score_rf_sim(features_full=features_imp, target=target,
                 col_to_simulate='pclass', pct_na_toinsert=0.2, n_repeat=10, verbose=False, *args, **kwargs):
    """ Insert a percentage of missing values in one variable, impute it, and measure the influence
    on performance (note: scored with logreg_cv despite the _rf_ in the name) """
    features_full_imp = insert_na(features_full, target=target,
                                  col_to_simulate=col_to_simulate, pct_na_toinsert=pct_na_toinsert, verbose=verbose)
    imp_f = NaImputer(features_full_imp)
    features_full_imp.loc[:, col_to_simulate] = imp_f.fillna_serie(colname=col_to_simulate)
    # repeated cross validation
#     score_rcv = 0
#     for i in range(n_repeat):
#         score_rcv += logreg_cv(features_full_imp, target, *args, **kwargs).mean()
    return logreg_cv(features_full_imp, target, verbose=False).mean()

In [175]:
score_rf_sim(col_to_simulate='pclass', verbose=True)


We are inserting 178 missing values
Out[175]:
0.79793164500984004

In [195]:
accuracy_mean_pct_na = np.array([score_rf_sim(
            pct_na_toinsert=i,col_to_simulate='pclass',verbose=True) for i in np.linspace(0,0.98,10)])


We are inserting 0 missing values
We are inserting 97 missing values
We are inserting 194 missing values
We are inserting 291 missing values
We are inserting 388 missing values
We are inserting 485 missing values
We are inserting 582 missing values
We are inserting 679 missing values
We are inserting 776 missing values
We are inserting 873 missing values

In [179]:
def sim_nmc(nmc=60, n_interval=5, *args, **kwargs):
    """ Average score_rf_sim over nmc Monte Carlo repetitions for each missing-value percentage """
    res = np.zeros(n_interval)
    for i in range(nmc):
        res += np.array([score_rf_sim(
            pct_na_toinsert=pct, *args, **kwargs) for pct in np.linspace(0, 0.98, n_interval)])
    return res / nmc

In [180]:
test = sim_nmc(nmc=30, n_interval=5)

In [181]:
test


Out[181]:
array([ 0.80017885,  0.79718608,  0.78924172,  0.78587207,  0.78358648])

In [182]:
np.linspace(0,0.98,5)


Out[182]:
array([ 0.   ,  0.245,  0.49 ,  0.735,  0.98 ])

In [183]:
plt.plot(np.linspace(0,0.98,5), test)
plt.title('Accuracy function of percentage of missing values inserted')


Out[183]:
<matplotlib.text.Text at 0x1170ab590>

Trying to predict missing values

We start with all features to have better prediction power


In [184]:
features_pred = features_full.copy().drop_duplicates()
features_pred = features_pred.drop('age', axis = 1)

In [185]:
index_na = insert_na(col_to_simulate='pclass', pct_na_toinsert=0.2, index=True)

In [186]:
index_na = features_pred.index.isin(index_na)

In [187]:
features_pred.head()


Out[187]:
fare pclass sibsp parch sex_female sex_male who_child who_man who_woman alone_False alone_True embarked_C embarked_Q embarked_S
0 7.2500 3 1 0 0 1 0 1 0 1 0 0 0 1
1 71.2833 1 1 0 1 0 0 0 1 1 0 1 0 0
2 7.9250 3 0 0 1 0 0 0 1 0 1 0 0 1
3 53.1000 1 1 0 1 0 0 0 1 1 0 0 0 1
4 8.0500 3 0 0 0 1 0 1 0 0 1 0 0 1

In [188]:
target = features_pred.pclass
features_pred = features_pred.drop('pclass', axis = 1)

In [189]:
features_pred_train, target_pred_train = features_pred.loc[~index_na, :], target[~index_na]
features_pred_test, target_pred_test = features_pred.loc[index_na, :], target[index_na]

In [190]:
features_pred_train.shape


Out[190]:
(614, 13)

In [191]:
features_pred_test.head()


Out[191]:
fare sibsp parch sex_female sex_male who_child who_man who_woman alone_False alone_True embarked_C embarked_Q embarked_S
7 21.075 3 1 0 1 1 0 0 1 0 0 0 1
17 13.000 0 0 0 1 0 1 0 0 1 0 0 1
20 26.000 0 0 0 1 0 1 0 0 1 0 0 1
24 21.075 3 1 1 0 1 0 0 1 0 0 0 1
26 7.225 0 0 0 1 0 1 0 0 1 1 0 0

In [192]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_pred_train, target_pred_train)
target_predicted = rf.predict(features_pred_test)
target_predicted_proba = rf.predict_proba(features_pred_test)

In [194]:
print(classification_report(target_pred_test, target_predicted))


             precision    recall  f1-score   support

          1       0.97      0.92      0.94        37
          2       0.85      0.82      0.84        28
          3       0.93      0.96      0.94        79

avg / total       0.92      0.92      0.92       144
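
To close the loop, a small sketch (reusing the objects defined above) that writes the predicted classes back into the masked positions and also reports plain accuracy:

In [ ]:
pclass_completed = target.copy()                 # true pclass values for every row of features_pred
pclass_completed[index_na] = target_predicted    # overwrite the simulated-missing entries with predictions
print("Accuracy on the masked rows: {:.3f}".format(accuracy_score(target_pred_test, target_predicted)))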